airq_data <- read.table('C:/AirQualityUCI.csv', sep = ';', header = TRUE, dec = ",")
airq_data <- airq_data %>% select_if(~sum(!is.na(.)) > 0)  %>% drop_na()
airq_data[c('CO.GT.', 'C6H6.GT.', 'T', 'RH', 'AH')] <-  sapply(airq_data[c('CO.GT.', 'C6H6.GT.', 'T', 'RH', 'AH')], as.numeric)
airq_data <- airq_data[, c(3:15)]
airq_data <- sapply(airq_data, function(x){ifelse (x == -200, NA, x)})
airq_data <- as.data.frame(airq_data)
cor_data <- cor(airq_data, method = 'spearman', use = 'pairwise.complete.obs')
corrplot(cor_data, method = 'circle')

• Exploring different models

  1. I decided to take the best predictors for linear model and combine them into one multiple
first_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S1.CO. + PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(first_model)
## 
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S1.CO. + PT08.S5.O3., 
##     data = airq_data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -306.67  -97.01  -24.56   64.68 1635.25 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.547e+03  1.064e+01 145.413  < 2e-16 ***
## PT08.S2.NMHC. -3.705e-01  1.397e-02 -26.528  < 2e-16 ***
## PT08.S1.CO.   -1.031e-01  1.860e-02  -5.543 3.05e-08 ***
## PT08.S5.O3.   -2.444e-01  9.627e-03 -25.386  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 146.2 on 8987 degrees of freedom
##   (366 observations deleted due to missingness)
## Multiple R-squared:  0.676,  Adjusted R-squared:  0.6759 
## F-statistic:  6251 on 3 and 8987 DF,  p-value: < 2.2e-16
vif(first_model)
## PT08.S2.NMHC.   PT08.S1.CO.   PT08.S5.O3. 
##      5.841369      6.860634      6.189541

Looking good! Let’s check the residuals:

plot(first_model, which = c(1,2))

residuals(first_model) %>% hist()

  1. Ok, let’s check if we also use the interaction between the variables:
second_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. * PT08.S1.CO. * PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(second_model)
## 
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. * PT08.S1.CO. * PT08.S5.O3., 
##     data = airq_data, na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -257.33  -80.69  -25.34   56.14 1673.31 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            3.651e+03  5.933e+01   61.54   <2e-16
## PT08.S2.NMHC.                         -2.337e+00  7.511e-02  -31.11   <2e-16
## PT08.S1.CO.                           -1.945e+00  7.432e-02  -26.17   <2e-16
## PT08.S5.O3.                           -1.586e+00  5.475e-02  -28.96   <2e-16
## PT08.S2.NMHC.:PT08.S1.CO.              1.628e-03  7.397e-05   22.02   <2e-16
## PT08.S2.NMHC.:PT08.S5.O3.              1.209e-03  4.696e-05   25.75   <2e-16
## PT08.S1.CO.:PT08.S5.O3.                1.037e-03  5.128e-05   20.21   <2e-16
## PT08.S2.NMHC.:PT08.S1.CO.:PT08.S5.O3. -8.839e-07  3.398e-08  -26.01   <2e-16
##                                          
## (Intercept)                           ***
## PT08.S2.NMHC.                         ***
## PT08.S1.CO.                           ***
## PT08.S5.O3.                           ***
## PT08.S2.NMHC.:PT08.S1.CO.             ***
## PT08.S2.NMHC.:PT08.S5.O3.             ***
## PT08.S1.CO.:PT08.S5.O3.               ***
## PT08.S2.NMHC.:PT08.S1.CO.:PT08.S5.O3. ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 125.6 on 8983 degrees of freedom
##   (366 observations deleted due to missingness)
## Multiple R-squared:  0.761,  Adjusted R-squared:  0.7608 
## F-statistic:  4086 on 7 and 8983 DF,  p-value: < 2.2e-16
vif(second_model)
##                         PT08.S2.NMHC.                           PT08.S1.CO. 
##                              228.9104                              148.3410 
##                           PT08.S5.O3.             PT08.S2.NMHC.:PT08.S1.CO. 
##                              271.3158                              854.6555 
##             PT08.S2.NMHC.:PT08.S5.O3.               PT08.S1.CO.:PT08.S5.O3. 
##                              607.5205                              749.8477 
## PT08.S2.NMHC.:PT08.S1.CO.:PT08.S5.O3. 
##                              865.0315

R^2 is bigger, but now there is a multicollinearity

  1. Also, I want to check if we can make a model with 2 preditors. Maybe it would be better. This model was the best one:
third_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(third_model)
## 
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data, 
##     na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -321.44  -96.31  -22.02   65.38 1647.49 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.498e+03  5.911e+00  253.37   <2e-16 ***
## PT08.S2.NMHC. -4.083e-01  1.221e-02  -33.42   <2e-16 ***
## PT08.S5.O3.   -2.727e-01  8.179e-03  -33.34   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 146.4 on 8988 degrees of freedom
##   (366 observations deleted due to missingness)
## Multiple R-squared:  0.6749, Adjusted R-squared:  0.6749 
## F-statistic:  9331 on 2 and 8988 DF,  p-value: < 2.2e-16
vif(third_model)
## PT08.S2.NMHC.   PT08.S5.O3. 
##        4.4527        4.4527
plot(third_model, which = c(1,2))

residuals(third_model) %>% hist()

Looks good, R^2 is the same as with 3 predictors.

  1. Also, we can try to make a model with ‘step’ function. It is not working with NA’s, so to use it I should clean a data. I delete the NMHC.GT. column to save more data. So, in the new data without NA I have 6841 rows.
summary(fourth_model)
## 
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S5.O3. + PT08.S2.NMHC. + NOx.GT. + 
##     AH + PT08.S4.NO2. + PT08.S1.CO. + NO2.GT. + C6H6.GT. + CO.GT. + 
##     PT08.S5.O3.:PT08.S2.NMHC. + PT08.S2.NMHC.:NO2.GT. + AH:PT08.S1.CO. + 
##     PT08.S2.NMHC.:PT08.S4.NO2. + NOx.GT.:PT08.S1.CO. + PT08.S2.NMHC.:PT08.S1.CO. + 
##     PT08.S2.NMHC.:AH + PT08.S4.NO2.:PT08.S1.CO. + PT08.S5.O3.:PT08.S4.NO2. + 
##     NOx.GT.:AH + PT08.S5.O3.:C6H6.GT. + PT08.S4.NO2.:C6H6.GT. + 
##     PT08.S4.NO2.:NO2.GT. + PT08.S1.CO.:C6H6.GT. + AH:C6H6.GT. + 
##     AH:NO2.GT. + PT08.S5.O3.:AH + NOx.GT.:C6H6.GT. + NOx.GT.:PT08.S4.NO2. + 
##     PT08.S2.NMHC.:NOx.GT. + NO2.GT.:C6H6.GT. + PT08.S2.NMHC.:C6H6.GT. + 
##     PT08.S5.O3.:NO2.GT. + PT08.S1.CO.:NO2.GT. + PT08.S5.O3.:PT08.S1.CO. + 
##     PT08.S4.NO2.:CO.GT. + NOx.GT.:NO2.GT. + PT08.S5.O3.:NOx.GT. + 
##     C6H6.GT.:CO.GT. + NO2.GT.:CO.GT. + AH:CO.GT. + NOx.GT.:CO.GT. + 
##     AH:PT08.S4.NO2. + PT08.S1.CO.:CO.GT. + PT08.S2.NMHC.:CO.GT., 
##     data = airq_data_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -229.20  -43.39  -11.00   24.46 1633.78 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.472e+03  1.419e+02  31.509  < 2e-16 ***
## PT08.S5.O3.                -5.576e-01  1.246e-01  -4.475 7.78e-06 ***
## PT08.S2.NMHC.              -2.801e+00  2.697e-01 -10.389  < 2e-16 ***
## NOx.GT.                     1.438e+00  2.730e-01   5.269 1.41e-07 ***
## AH                         -6.798e+02  8.962e+01  -7.586 3.73e-14 ***
## PT08.S4.NO2.                1.198e+00  1.677e-01   7.142 1.01e-12 ***
## PT08.S1.CO.                -3.952e+00  2.169e-01 -18.222  < 2e-16 ***
## NO2.GT.                    -4.788e+00  8.511e-01  -5.626 1.92e-08 ***
## C6H6.GT.                    2.643e+02  1.892e+01  13.970  < 2e-16 ***
## CO.GT.                      1.858e+02  3.437e+01   5.406 6.64e-08 ***
## PT08.S5.O3.:PT08.S2.NMHC.   9.906e-04  2.215e-04   4.472 7.89e-06 ***
## PT08.S2.NMHC.:NO2.GT.       1.038e-03  1.309e-03   0.793 0.427754    
## AH:PT08.S1.CO.             -9.439e-01  7.970e-02 -11.843  < 2e-16 ***
## PT08.S2.NMHC.:PT08.S4.NO2. -4.514e-03  2.760e-04 -16.356  < 2e-16 ***
## NOx.GT.:PT08.S1.CO.         3.087e-04  1.769e-04   1.745 0.080977 .  
## PT08.S2.NMHC.:PT08.S1.CO.   2.284e-03  3.896e-04   5.862 4.77e-09 ***
## PT08.S2.NMHC.:AH            1.681e+00  1.357e-01  12.388  < 2e-16 ***
## PT08.S4.NO2.:PT08.S1.CO.    2.580e-03  1.532e-04  16.837  < 2e-16 ***
## PT08.S5.O3.:PT08.S4.NO2.   -6.974e-04  8.413e-05  -8.290  < 2e-16 ***
## NOx.GT.:AH                  8.090e-01  7.445e-02  10.867  < 2e-16 ***
## PT08.S5.O3.:C6H6.GT.       -5.509e-03  7.492e-03  -0.735 0.462179    
## PT08.S4.NO2.:C6H6.GT.       1.142e-01  8.520e-03  13.405  < 2e-16 ***
## PT08.S4.NO2.:NO2.GT.        6.990e-03  5.469e-04  12.782  < 2e-16 ***
## PT08.S1.CO.:C6H6.GT.       -1.577e-01  1.277e-02 -12.346  < 2e-16 ***
## AH:C6H6.GT.                -4.332e+01  4.967e+00  -8.720  < 2e-16 ***
## AH:NO2.GT.                 -2.434e+00  2.431e-01 -10.013  < 2e-16 ***
## PT08.S5.O3.:AH              2.008e-01  4.027e-02   4.986 6.32e-07 ***
## NOx.GT.:C6H6.GT.            7.997e-02  1.156e-02   6.920 4.93e-12 ***
## NOx.GT.:PT08.S4.NO2.       -1.088e-03  1.423e-04  -7.645 2.37e-14 ***
## PT08.S2.NMHC.:NOx.GT.      -1.857e-03  4.105e-04  -4.523 6.20e-06 ***
## NO2.GT.:C6H6.GT.           -1.512e-01  4.207e-02  -3.594 0.000328 ***
## PT08.S2.NMHC.:C6H6.GT.     -4.924e-02  7.449e-03  -6.610 4.13e-11 ***
## PT08.S5.O3.:NO2.GT.         1.561e-03  3.359e-04   4.647 3.43e-06 ***
## PT08.S1.CO.:NO2.GT.        -3.470e-03  7.353e-04  -4.720 2.41e-06 ***
## PT08.S5.O3.:PT08.S1.CO.     2.528e-04  6.636e-05   3.809 0.000140 ***
## PT08.S4.NO2.:CO.GT.        -1.367e-01  2.416e-02  -5.658 1.60e-08 ***
## NOx.GT.:NO2.GT.             2.454e-03  4.560e-04   5.381 7.64e-08 ***
## PT08.S5.O3.:NOx.GT.        -3.739e-04  9.185e-05  -4.071 4.74e-05 ***
## C6H6.GT.:CO.GT.             6.006e+00  1.622e+00   3.704 0.000214 ***
## NO2.GT.:CO.GT.             -4.678e-01  1.082e-01  -4.324 1.55e-05 ***
## AH:CO.GT.                   4.907e+01  1.526e+01   3.215 0.001309 ** 
## NOx.GT.:CO.GT.              3.801e-02  2.044e-02   1.860 0.062956 .  
## AH:PT08.S4.NO2.             4.937e-02  2.667e-02   1.851 0.064202 .  
## PT08.S1.CO.:CO.GT.          5.557e-02  2.518e-02   2.207 0.027342 *  
## PT08.S2.NMHC.:CO.GT.       -1.116e-01  6.346e-02  -1.758 0.078805 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 93.8 on 6896 degrees of freedom
## Multiple R-squared:  0.8622, Adjusted R-squared:  0.8613 
## F-statistic: 980.7 on 44 and 6896 DF,  p-value: < 2.2e-16
vif(fourth_model)
##                PT08.S5.O3.              PT08.S2.NMHC. 
##                  2024.1919                  3998.8721 
##                    NOx.GT.                         AH 
##                  2558.3458                  1019.1196 
##               PT08.S4.NO2.                PT08.S1.CO. 
##                  2769.3461                  1775.4654 
##                    NO2.GT.                   C6H6.GT. 
##                  1287.6663                 15737.0072 
##                     CO.GT.  PT08.S5.O3.:PT08.S2.NMHC. 
##                  1935.2479                 19480.9209 
##      PT08.S2.NMHC.:NO2.GT.             AH:PT08.S1.CO. 
##                  7071.4440                  1361.6993 
## PT08.S2.NMHC.:PT08.S4.NO2.        NOx.GT.:PT08.S1.CO. 
##                 31706.9947                  2566.8099 
##  PT08.S2.NMHC.:PT08.S1.CO.           PT08.S2.NMHC.:AH 
##                 33298.0434                  3779.1217 
##   PT08.S4.NO2.:PT08.S1.CO.   PT08.S5.O3.:PT08.S4.NO2. 
##                  8868.3781                  4645.5430 
##                 NOx.GT.:AH       PT08.S5.O3.:C6H6.GT. 
##                   211.9693                  9578.6010 
##      PT08.S4.NO2.:C6H6.GT.       PT08.S4.NO2.:NO2.GT. 
##                 15057.8480                  1773.1004 
##       PT08.S1.CO.:C6H6.GT.                AH:C6H6.GT. 
##                 18882.8108                  1858.7029 
##                 AH:NO2.GT.             PT08.S5.O3.:AH 
##                   122.2585                   457.8575 
##           NOx.GT.:C6H6.GT.       NOx.GT.:PT08.S4.NO2. 
##                  3687.7470                  2254.8223 
##      PT08.S2.NMHC.:NOx.GT.           NO2.GT.:C6H6.GT. 
##                 12591.6935                  2831.2905 
##     PT08.S2.NMHC.:C6H6.GT.        PT08.S5.O3.:NO2.GT. 
##                  6380.6342                   864.0077 
##        PT08.S1.CO.:NO2.GT.    PT08.S5.O3.:PT08.S1.CO. 
##                  2375.7489                  1840.3849 
##        PT08.S4.NO2.:CO.GT.            NOx.GT.:NO2.GT. 
##                  4347.9094                   309.5329 
##        PT08.S5.O3.:NOx.GT.            C6H6.GT.:CO.GT. 
##                  1082.2351                  4614.8798 
##             NO2.GT.:CO.GT.                  AH:CO.GT. 
##                   784.4240                   595.1000 
##             NOx.GT.:CO.GT.            AH:PT08.S4.NO2. 
##                   493.1602                   400.2981 
##         PT08.S1.CO.:CO.GT.       PT08.S2.NMHC.:CO.GT. 
##                  2840.9688                 17118.7947
plot(fourth_model, which = c(1,2))

residuals(fourth_model) %>% hist()

R^2 is bigger, but there is a lot of multicollinearity and the residuals looking not very good.

• So, I decided to choose the third model (with 2 predictors) as the best one. Now, I want to check if the transformation will improve the model.

R <- NULL
for (i in -5:5){
  one <- airq_data$PT08.S2.NMHC.
  two <-  airq_data$PT08.S5.O3.
  if (i < 0){
    one <- one^(i) *(-1)
    two <- two^(i) *(-1)
  }
  else if (i == 0) {
    one <- log(one)
    two <-  log(two)
  }
  else {
    one <- one^i 
    two <-  two^i 
  }
  model <- lm(PT08.S3.NOx. ~ one + two, data = airq_data, na.action = na.exclude)
  R <- c(R, summary(model)$adj.r.squared)
}
transform <- data.frame('p' = c(-5:5), 'R^2' = R)
plot(transform, type = 'b')

transform[max(transform),]
##    p       R.2
## 5 -1 0.7488992

• Making the best model and predicting the values

airq_data$PT08.S2.NMHC. <- (airq_data$PT08.S2.NMHC.)^-1 * (-1)
airq_data$PT08.S5.O3. <- (airq_data$PT08.S5.O3.)^-1 * (-1)
final_model <- lm(PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data, na.action = na.exclude)
summary(final_model)
## 
## Call:
## lm(formula = PT08.S3.NOx. ~ PT08.S2.NMHC. + PT08.S5.O3., data = airq_data, 
##     na.action = na.exclude)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -390.50  -80.94  -20.36   65.08 1718.23 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    1.665e+02  5.195e+00   32.04   <2e-16 ***
## PT08.S2.NMHC. -3.891e+05  7.947e+03  -48.96   <2e-16 ***
## PT08.S5.O3.   -1.906e+05  5.208e+03  -36.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 128.7 on 8988 degrees of freedom
##   (366 observations deleted due to missingness)
## Multiple R-squared:  0.749,  Adjusted R-squared:  0.7489 
## F-statistic: 1.341e+04 on 2 and 8988 DF,  p-value: < 2.2e-16
vif(final_model)
## PT08.S2.NMHC.   PT08.S5.O3. 
##      3.926512      3.926512
plot(final_model, which = c(1,2))

residuals(final_model) %>% hist()

• Prediction:

test_subset_multiple <-  airq_data[which(row.names(airq_data) %in% sample(row.names(airq_data), 50, replace = FALSE)), c(5,10,7)]
test_multiple <- data.frame(PT08.S2.NMHC. = test_subset_multiple$PT08.S2.NMHC., PT08.S5.O3. = test_subset_multiple$PT08.S5.O3.)
test_subset_multiple$pred_PT08.S3.NOx. <- predict(final_model, newdata = test_multiple)
colnames(test_subset_multiple) <- c('real_PT08.S2.NMHC.','real_PT08.S5.O3.', 'real_PT08.S3.NOx.', 'pred_PT08.S3.NOx.')
df <- data.frame('y' = airq_data$PT08.S3.NOx. , 'x1' = airq_data$PT08.S2.NMHC., 'x2' = airq_data$PT08.S5.O3.)

### Estimation of the regression plane
mod <- lm(y ~ x1+x2, data = df)
cf.mod <- coef(mod)
df <- df %>% drop_na()
### Calculate z on a grid of x-y values
PT08.S2.NMHC. <- seq(min(df$x1),max(df$x1),length.out=25)
PT08.S5.O3. <- seq(min(df$x2),max(df$x2),length.out=25)
z <- t(outer(PT08.S2.NMHC., PT08.S5.O3., function(x,y) cf.mod[1]+cf.mod[2]*x+cf.mod[3]*y))
#### Draw the plane with "plot_ly" and add points with "add_trace"

R <- round(summary(final_model)$adj.r.squared, digits = 3)
p <- round(summary(final_model)$coefficients[2,4], digits = 3)
titl <- paste('R^2 =', as.character(R),', p-val =', as.character(p))
 

plot_ly(x=~PT08.S2.NMHC., y=~PT08.S5.O3., z=~z, type="surface", opacity=0.7)  %>%
  add_trace(data = test_subset_multiple, x = ~ real_PT08.S2.NMHC., y = ~ real_PT08.S5.O3., 
            z = ~  pred_PT08.S3.NOx., type="scatter3d", size = 1, name= "Predicted", opacity=0.8) %>%
  add_trace(data = test_subset_multiple, x = ~ real_PT08.S2.NMHC., y = ~ real_PT08.S5.O3., 
            z = ~  real_PT08.S3.NOx., type="scatter3d", size = 1, name="Real", opacity=0.8) %>% 
  layout(title = titl)
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 1 observations
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: Ignoring 1 observations